Data input

set.seed("260823")

source("Simulation_model_code/estimate_relatedness.R")
source("Simulation_model_code/simulation_model.R")

if (!dir.exists("Numerical_results")) dir.create("Numerical_results")
if (!dir.exists("Numerical_results/Figures")) dir.create("Numerical_results/Figures")
if (!dir.exists("Numerical_results/Simulated_data")) dir.create("Numerical_results/Simulated_data")

RUN_ALL <- TRUE

MAX_CLONAL_CLUSTER <- 4  # subgraph size for relationship graph
N_MARKER_DENSE <- 24000  # number of markers for dense simulated data
N_MARKER_SPARSE <- 200   # number of markers for sparse data (polymorphic, downsampled from dense)
N_LINEAGES <- 100        # number of unrelated founder lineages
N_SAMPLES <- 100         # size of simulated parasite population
N_GENERATIONS <- 10      # number of generations of inbreeding
PROB_COTRANS <- 0.4      # enriched probability of outbreeding within subgraphs
M <- 1000                 # av consecutive markers inherited from a single parent
P <- 0.9                 # expected frequency of major allele in founder population 

N_MARKER_DOWNSAMPLE <- seq(500, 12000, 500)
N_MARKER_DOWNSAMPLE_SUBSET <- c(500, 1000, 1500, 2000, 4000, 8000, 12000)
# number of downsampled (polymorphic) markers to illustrate HMM vs independence model tradeoff

Run simulation model

Simulate successive generations of inbreeding, capturing dense genotypic data whilst accommodating marker linkage

if (RUN_ALL || !file.exists("Numerical_results/Simulated_data/dense_data_simulation.rds")) {
  dense_data_sim <- simulate_inbreeding(max_clonal_cluster=MAX_CLONAL_CLUSTER, 
                                           n_markers=N_MARKER_DENSE, 
                                           n_lineages=N_LINEAGES, 
                                           n_samples=N_SAMPLES, 
                                           n_generations=N_GENERATIONS, 
                                           prob_cotrans=PROB_COTRANS, 
                                           M=M, p=P)
  write_rds(dense_data_sim, "Numerical_results/Simulated_data/dense_data_simulation.rds",
            compress="gz")
} else {
  dense_data_sim <- read_rds("Numerical_results/Simulated_data/dense_data_simulation.rds")
}

Generate relatedness estimates

Generate relatedness estimates for the subset of polymorphic markers within the dense simulated dataset across each generation

pairwise_classifications <- lapply(1:N_GENERATIONS, function(i) {
  classify_pairs(dense_data_sim[["relatedness_graphs"]][[i]],
                 dense_data_sim[["enriched_cotrans_components"]][[i]])}) %>% 
  bind_rows(.id="generation")

if (RUN_ALL || !file.exists("Numerical_results/Simulated_data/dense_data_sim_ibd_estimates.rds")) {
  dense_data_ibd <- estimate_relatedness_sim(dense_data_sim, 
                                             loci_to_keep=paste0("m", 1:N_MARKER_DENSE), 
                                             generations=1:N_GENERATIONS)
  write_rds(dense_data_ibd, "Numerical_results/Simulated_data/dense_data_sim_ibd_estimates.rds", compress="gz")
} else {
  dense_data_ibd <- read_rds("Numerical_results/Simulated_data/dense_data_sim_ibd_estimates.rds")
}

polymorphic_marker_info <- lapply(dense_data_sim[["locus_metrics"]], function(x) {
  x %>% subset(prop_pair_ibs<1) %>% mutate(maf=(1-sqrt(2*prop_pair_ibs-1))/2)
})

Generate a sparse dataset by downsampling polymorphic markers and generate relatedness estimates across each generation

sparse_marker_subsets <- lapply(polymorphic_marker_info, function(x) {
  paste0("m", sort(as.numeric(gsub("m", "", sample(rownames(x), N_MARKER_SPARSE)))))})

locus_metrics_sparse <- 
  lapply(1:N_GENERATIONS, function(i) dense_data_sim[["locus_metrics"]][[i]][sparse_marker_subsets[[i]],])

sparse_data_ibd <- list()
for (i in 1:N_GENERATIONS) {
  if (RUN_ALL || !file.exists(paste0("Numerical_results/Simulated_data/sparse_data_sim_ibd_estimates_G", i, ".rds"))) {
    sparse_data_ibd[[i]] <- estimate_relatedness_sim(dense_data_sim,
                                                     loci_to_keep=sparse_marker_subsets[[i]],
                                                     generations=i)
    write_rds(sparse_data_ibd[[i]], paste0("Numerical_results/Simulated_data/sparse_data_sim_ibd_estimates_G", i, ".rds"), compress="gz")
  }
  else {
    sparse_data_ibd[[i]] <- read_rds(paste0("Numerical_results/Simulated_data/sparse_data_sim_ibd_estimates_G", i, ".rds"))
  }
}

Downsample markers at the final simulated generation of inbreeding (N_GENERATIONS=10)

N_MARKER_DOWNSAMPLE <- subset(N_MARKER_DOWNSAMPLE, N_MARKER_DOWNSAMPLE<=min(sapply(polymorphic_marker_info, nrow)))
N_MARKER_DOWNSAMPLE_SUBSET <- subset(N_MARKER_DOWNSAMPLE_SUBSET, N_MARKER_DOWNSAMPLE_SUBSET<=min(sapply(polymorphic_marker_info, nrow)))

downsample_marker_subsets <- lapply(N_MARKER_DOWNSAMPLE, function(n_subset) {
  paste0("m", sort(as.numeric(gsub("m", "", sample(rownames(polymorphic_marker_info[[N_GENERATIONS]]), 
                                                     n_subset)))))})

names(downsample_marker_subsets) <- paste0("N_", N_MARKER_DOWNSAMPLE)

downsampled_data_ibd <- list()
for (i in names(downsample_marker_subsets)) {
  if (RUN_ALL || !file.exists(paste0("Numerical_results/Simulated_data/downsampled_data_sim_ibd_estimates_G_", N_GENERATIONS, "_", i, ".rds"))) {
    downsampled_data_ibd[[i]] <- estimate_relatedness_sim(dense_data_sim,
                                                          loci_to_keep=downsample_marker_subsets[[i]],
                                                          generations=N_GENERATIONS)
    write_rds(downsampled_data_ibd[[i]], paste0("Numerical_results/Simulated_data/downsampled_data_sim_ibd_estimates_G_", N_GENERATIONS, "_", i, ".rds"), compress="gz")
  }
  else {
    downsampled_data_ibd[[i]] <- read_rds(paste0("Numerical_results/Simulated_data/downsampled_data_sim_ibd_estimates_G_", N_GENERATIONS, "_", i, ".rds"))
  }
}

downsampled_data_ibd_summary <- 
  lapply(downsampled_data_ibd, function(x) x[["ibd_ibs_est"]]) %>% 
  bind_rows(.id="random_subset")

Numerical results

Summary of pairwise relatedness over successive generations of inbreeding

Misspecification of the standard nIBD-to-IBS model over successive generations of inbreeding

Relatedness estimates under (n)IBD independence and the corrected nIBD-to-IBS model over successive generations of inbreeding

Joyplots of relatedness estimates for siblings over successive generations of inbreeding

Bias in pairwise relatedness estimates as a function of marker density (for downsampled data)

Pairwise relatedness estimates as a function of marker density (for downsampled data)

Error in pairwise relatedness estimates under the HMM as a function of marker density (for downsampled data)

Zero inflation in pairwise relatedness estimates

Rationale for dense data elbow diagnostic